
from bs4 import BeautifulSoup
import pandas
import math
from urllib.request import urlopen
from urllib.parse import quote
from datetime import datetime


# Search terms to look up on the Sputnik website; each one is searched in turn
# (lower-cased and URL-quoted before being put into the search URL).
keywords = ["vakcina"]

# Path of the CSV file that receives the collected article URLs.
myfile = "LINKS_sputnik_rs_vaccine.csv"

# Publication-date window, given as year, month, day. Both bounds are
# inclusive: an article is kept when start <= article date <= end.
my_start_date = datetime(2020, 12, 1).date()
my_end_date = datetime(2021, 11, 30).date()

# Base URL of the Sputnik site; used for the search URL and prepended to the
# relative article links found in the results.
domain = "https://rs-lat.sputniknews.com"


# Accumulates the article URLs found for all keywords (may contain duplicates).
url_list = []

# For every keyword, page through the site's search results (20 articles per
# page) and collect the URLs of articles published within
# [my_start_date, my_end_date] into url_list.
for keyword in keywords:

    print("We are doing this keyword: " + keyword)

    # quote() percent-encodes the keyword so non-ASCII characters are URL-safe.
    base_search_url = domain + "/search/?query=" + quote(keyword.lower())

    # Use the response as a context manager so the connection is closed as
    # soon as the page has been parsed.
    with urlopen(base_search_url) as response:
        soup = BeautifulSoup(response, parser = "html", features = "lxml")

    # First get the total number of articles so that we know how many times
    # we will have to "scroll". There are 20 articles per scroll.
    num_articles = soup.find("div", {"class": "counter m-visible"}).find("span").text

    print("The number of articles for all time is: " + str(num_articles))

    # Round UP so that a final, partially filled page is still fetched.
    cycles = math.ceil(int(num_articles) / 20)

    print("The maximum number of scrolls is: " + str(cycles))

    # Credit for how to deal with scrolling goes to Miranda Lupion.
    # The key is the offset parameter, which the site embeds near the (often
    # hidden) "next 20 articles" button.

    # Scroll as many times as necessary, but stop early once the results are
    # older than the start date, because a full scan can take a long time.
    cycle = 0
    while cycle < cycles:
        # Always derive the page URL from the base search URL. Appending to the
        # previous page's URL would pile up one "/&offset=" suffix per page.
        page_url = base_search_url + "/&offset=" + str(cycle * 20)
        with urlopen(page_url) as response:
            soup = BeautifulSoup(response, parser = "html", features = "lxml")
        article_list = soup.find_all("div", {"class": "list__item"})

        # An empty page means there is nothing left to read for this keyword.
        if not article_list:
            break

        # Reset per page so the decision never depends on an undefined or
        # stale value. The flag reflects the LAST article on the page
        # (presumably results are listed newest first - verify on the site).
        reached_start_date = False
        for article in article_list:
            full_url = domain + article.find("a", {"class": "list__title"})["href"]
            # The unix timestamp is easier than parsing the date text, and it
            # is present even when the visible text only says "today" + time.
            unixtime = article.find("div", {"class": "list__date"})["data-unixtime"]
            # NOTE: fromtimestamp() converts using the machine's local timezone.
            article_date = datetime.fromtimestamp(int(unixtime)).date()

            if article_date < my_start_date:
                # The article is older than the start date: signal that
                # paging should stop after this page.
                reached_start_date = True
            else:
                reached_start_date = False
                # Keep the article only if it is not newer than the end date;
                # newer articles are skipped but paging continues.
                if article_date <= my_end_date:
                    url_list.append(full_url)

        # Leave the while loop entirely once we are past the start date.
        if reached_start_date:
            break

        # Otherwise go to the next page.
        cycle = cycle + 1
        print("We are on scroll: " + str(cycle))
    

print("Before removing duplicates we have this many articles: " + str(len(url_list)))

# Remove duplicate URLs while keeping the first occurrence of each in its
# original order. dict keys are unique and preserve insertion order, so this
# is a single linear pass instead of a list scan per URL.
unique_url_list = list(dict.fromkeys(url_list))

print("After removing duplicates we have: " + str(len(unique_url_list)))


# Put the URLs into a single-column DataFrame so the CSV column has a label.
# Building it from a column mapping keeps the "url" header even when no
# articles were found (a DataFrame built from an empty list has no columns).
df = pandas.DataFrame({"url": unique_url_list})

# utf-8-sig writes a BOM at the start of the file; the default row index is
# written as the first column, as before.
df.to_csv(myfile, encoding = "utf-8-sig")


